Data Processing

First Steps

Read in first data sheet (EDU01a)

library(readr)
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
sheet1


Select columns (Area_name as area_name, STCOU, any column ending in “D”)

sheet1 <- sheet1 %>% 
  select(Area_name, STCOU, ends_with("D")) %>%
  rename("area_name" = Area_name)
sheet1


Convert to long format

sheet1 <- sheet1 %>%
  pivot_longer(cols=3:12, names_to="edu", values_to = "enrollment")
sheet1


Parse the new column “edu” to pull out the year and convert to YYYY format Grab the first 7 characters of the new column “edu”

# In the first step we drop the "D" from the "edu" column and then split the resulting column into two new columns - "edu" overwritten to now contain the first three characters and four digits of the old "edu" column, and "yr" which contains the last two digits of the year for the enrollment value
sheet1 <- sheet1 %>%
  mutate(edu=substr(sheet1$edu, 1, 9)) %>%
  separate(edu, into=c("edu_code", "yr"), sep=7, remove = TRUE, convert = FALSE)
# In the second step, we evaluate if the new column "yr" is greater than 50. If so, we paste0 "19" to it's value and force the value to be read as numeric; if the column value is less than 50, we paste "20" to the value and force the value to be read as numeric. 
sheet1 <- sheet1 %>%
  mutate(yr = if_else(sheet1$yr > 50, as.numeric(paste0("19", sheet1$yr, sep="")), as.numeric(paste0("20", sheet1$yr, sep=""))))
sheet1


Create two data sets:

  • county containing only the county-level rows
  • state containing the non-county level rows

Add appropriate class to each

county_df <- sheet1[grep(pattern = ", \\w\\w", sheet1$area_name), ]
state_df <- sheet1[-c(grep(patter = ", \\w\\w", sheet1$area_name)), ]
class(county_df) <- c("county", class(county_df))
class(state_df) <- c("state", class(state_df))
county_df
state_df


Add column containing state information for the county-level tibble

county_df <- county_df %>%
  mutate("state" = substr(county_df$area_name, nchar(county_df$area_name)-1, nchar(county_df$area_name)))
county_df


Add column to non-county tibble corresponding to the state’s classification of division

# created a vector for each regional division containing the names of the states within the division
new_england <- c("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
mid_atlantic <- c("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
east_north_central <- c("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
west_north_central <- c("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
# included District of Columbia in the South Atlantic Division since both Maryland and Virginia are in this division
south_atlantic <- c("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "WEST VIRGINIA", "District of Columbia")
east_south_central <- c("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
west_south_central <- c("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
mountain <- c("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
pacific <- c("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
# created division column using if/else logic
state_df <- state_df %>%
  mutate("division" = if_else(area_name %in% new_england, "New England", 
          if_else(area_name %in% mid_atlantic, "Mid Atlantic", 
          if_else(area_name %in% east_north_central, "East North Central", 
          if_else(area_name %in% west_north_central, "West North Central", 
          if_else(area_name %in% south_atlantic, "South Atlantic", 
          if_else(area_name %in% east_south_central, "East South Central", 
          if_else(area_name %in% west_south_central, "West South Central", 
          if_else(area_name %in% mountain, "Mountain", 
          if_else(area_name %in% pacific, "Pacific", "ERROR"))))))))))
state_df

Write Functions

Read in second data sheet (EDU01b)

sheet2 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")


Write a function to select/rename columns and pivot to long format

# included option argument to name column with enrollment data
select_and_pivot <- function(sheet, var_name = "population") {
  df <- sheet %>%
            select(Area_name, STCOU, ends_with("D")) %>%
            rename("area_name" = Area_name) %>%
            pivot_longer(cols=3:12, names_to="edu", values_to = var_name)
  return(df)
}
census <- select_and_pivot(sheet2, var_name = "enroll")
census


Write a function that takes the output from select_and_pivot() and creates a year column and a census description column (edu_code)

create_edu_yr <- function(sheet) {
  dfa <- sheet %>% 
           mutate(edu=substr(sheet$edu, 1, 9)) %>%
           separate(edu, into=c("edu_code", "yr"), sep=7, remove = TRUE, convert = FALSE)
  dfb <- dfa %>%
           mutate(yr = if_else(dfa$yr > 50, as.numeric(paste0("19", dfa$yr, sep="")), as.numeric(paste0("20", dfa$yr, sep=""))))
  return(dfb)
}
census <- create_edu_yr(census)
census


Write a function that will add a state information column (state) to the county-level tibble

create_state <- function(sheet) {
  df <- sheet %>%
    mutate("state" = substr(sheet$area_name, nchar(sheet$area_name)-1, nchar(sheet$area_name)))
  return(df)
}

Write a function that will add a division classification column (division) to the non-county tibble

create_division <- function(sheet) {
  df <- sheet %>%
    mutate("division" = if_else(area_name %in% new_england, "New England", 
            if_else(area_name %in% mid_atlantic, "Mid Atlantic", 
            if_else(area_name %in% east_north_central, "East North Central", 
            if_else(area_name %in% west_north_central, "West North Central", 
            if_else(area_name %in% south_atlantic, "South Atlantic", 
            if_else(area_name %in% east_south_central, "East South Central", 
            if_else(area_name %in% west_south_central, "West South Central", 
            if_else(area_name %in% mountain, "Mountain", 
            if_else(area_name %in% pacific, "Pacific", "ERROR"))))))))))
  return(df)
}


Write a function that takes the output from create_edu_yr() and creates two tibbes - one for county rows and another for non-county rows. Add state column to county tibble and add division classification to non-county tibble. Return one object with two tibbles

split_county_state <- function(sheet) {
  county <- sheet[grep(pattern = ", \\w\\w", sheet$area_name), ]
  class(county) <- c("county", class(county))
  county <- create_state(county)
  state <- sheet[-c(grep(patter = ", \\w\\w", sheet$area_name)), ]
  class(state) <- c("state", class(state))
  state <- create_division(state)
  return(list(county = county, state = state))
}
census <- split_county_state(census)
census$county
census$state


Write a wrapper function that will take in a URL address to one CSV file and return two tibbles in a single object

process_data <- function(url, var_name = "population") {
  a <- read_csv(url)
  b <- select_and_pivot(a, var_name)
  c <- create_edu_yr(b)
  d <- split_county_state(c)
  return(d)
}

Call It & Combine

Call the wrapper function twice, once for each census file (EDU01a & EDU01b)

edu_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", var_name = "pop")
edu_01a
## $county
## # A tibble: 31,450 × 6
##    area_name   STCOU edu_code    yr   pop state
##    <chr>       <chr> <chr>    <dbl> <dbl> <chr>
##  1 Autauga, AL 01001 EDU0101   1987  6829 AL   
##  2 Autauga, AL 01001 EDU0101   1988  6900 AL   
##  3 Autauga, AL 01001 EDU0101   1989  6920 AL   
##  4 Autauga, AL 01001 EDU0101   1990  6847 AL   
##  5 Autauga, AL 01001 EDU0101   1991  7008 AL   
##  6 Autauga, AL 01001 EDU0101   1992  7137 AL   
##  7 Autauga, AL 01001 EDU0101   1993  7152 AL   
##  8 Autauga, AL 01001 EDU0101   1994  7381 AL   
##  9 Autauga, AL 01001 EDU0101   1995  7568 AL   
## 10 Autauga, AL 01001 EDU0101   1996  7834 AL   
## # ℹ 31,440 more rows
## 
## $state
## # A tibble: 530 × 6
##    area_name     STCOU edu_code    yr      pop division
##    <chr>         <chr> <chr>    <dbl>    <dbl> <chr>   
##  1 UNITED STATES 00000 EDU0101   1987 40024299 ERROR   
##  2 UNITED STATES 00000 EDU0101   1988 39967624 ERROR   
##  3 UNITED STATES 00000 EDU0101   1989 40317775 ERROR   
##  4 UNITED STATES 00000 EDU0101   1990 40737600 ERROR   
##  5 UNITED STATES 00000 EDU0101   1991 41385442 ERROR   
##  6 UNITED STATES 00000 EDU0101   1992 42088151 ERROR   
##  7 UNITED STATES 00000 EDU0101   1993 42724710 ERROR   
##  8 UNITED STATES 00000 EDU0101   1994 43369917 ERROR   
##  9 UNITED STATES 00000 EDU0101   1995 43993459 ERROR   
## 10 UNITED STATES 00000 EDU0101   1996 44715737 ERROR   
## # ℹ 520 more rows
edu_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", var_name = "pop")
edu_01b
## $county
## # A tibble: 31,450 × 6
##    area_name   STCOU edu_code    yr   pop state
##    <chr>       <chr> <chr>    <dbl> <dbl> <chr>
##  1 Autauga, AL 01001 EDU0101   1997  8099 AL   
##  2 Autauga, AL 01001 EDU0101   1998  8211 AL   
##  3 Autauga, AL 01001 EDU0101   1999  8489 AL   
##  4 Autauga, AL 01001 EDU0102   2000  8912 AL   
##  5 Autauga, AL 01001 EDU0102   2001  8626 AL   
##  6 Autauga, AL 01001 EDU0102   2002  8762 AL   
##  7 Autauga, AL 01001 EDU0152   2003  9105 AL   
##  8 Autauga, AL 01001 EDU0152   2004  9200 AL   
##  9 Autauga, AL 01001 EDU0152   2005  9559 AL   
## 10 Autauga, AL 01001 EDU0152   2006  9652 AL   
## # ℹ 31,440 more rows
## 
## $state
## # A tibble: 530 × 6
##    area_name     STCOU edu_code    yr      pop division
##    <chr>         <chr> <chr>    <dbl>    <dbl> <chr>   
##  1 UNITED STATES 00000 EDU0101   1997 44534459 ERROR   
##  2 UNITED STATES 00000 EDU0101   1998 46245814 ERROR   
##  3 UNITED STATES 00000 EDU0101   1999 46368903 ERROR   
##  4 UNITED STATES 00000 EDU0102   2000 46818690 ERROR   
##  5 UNITED STATES 00000 EDU0102   2001 47127066 ERROR   
##  6 UNITED STATES 00000 EDU0102   2002 47606570 ERROR   
##  7 UNITED STATES 00000 EDU0152   2003 48506317 ERROR   
##  8 UNITED STATES 00000 EDU0152   2004 48693287 ERROR   
##  9 UNITED STATES 00000 EDU0152   2005 48978555 ERROR   
## 10 UNITED STATES 00000 EDU0152   2006 49140702 ERROR   
## # ℹ 520 more rows


Write a function to combine the county and non-county tibbles of the above outputs

combine_data <- function(data_1, data_2) {
  a <- bind_rows(data_1$county, data_2$county)
  b <- bind_rows(data_1$state, data_2$state)
  return(list(county = a,state = b))
}


Call combine function

edu_01 <- combine_data(edu_01a, edu_01b)
edu_01$county
edu_01$state

Summarizing

state plotting method:

# calculates the mean of the var_name (population) by regional division and year
# filters out all rows were division == "ERROR"
plot.state <- function(df, var_name = "population") {
  new_df <- df %>%
    group_by(division, yr) %>% 
    summarise(ENRmean = mean(get(var_name))) %>%
    filter(division != "ERROR")
  
# plots a line graph of the mean var_name (population) by year for each regional division
  ggplot(new_df, aes(x = yr, y = ENRmean, color = division)) + 
    geom_line() + scale_y_continuous(labels = scales::comma) + ylab(var_name)
}


county plotting method

# order defaults to "top", so if order argument is left blank or if "top" is entered, the function will return dataframe in descending order. If "bottom" is entered for the argument, the dataframe will return in ascending order. If anything other than "top" or "bottom" is entered, the function will STOP and request user to enter appropriate argument
plot.county <- function(df, var_name = "population", st = "IL", order = "top", n = 5) {
  if(order == "top"){
  new_df <- df %>%
    filter(state == st) %>%
    group_by(area_name) %>%
    summarise(ENRmean = mean(get(var_name))) %>%
    arrange(desc(ENRmean))
  }
  else if(order == "bottom"){
    new_df <- df %>%
      filter(state == st) %>%
      group_by(area_name) %>%
      summarise(ENRmean = mean(get(var_name))) %>%
      arrange(ENRmean)
  }
  else{
    stop("Specify Order (top or bottom)")
  }
# the above function returns the highest or lowest means of var_name (population) grouped by county for the state entered in the argument.
# the below function filters the original input dataframe based on the area_names of the above results, so that the new dataframe (filter_df) contains non-aggregated data for the counties identified in the above results
  new_df <- new_df[1:n, ]
  filter_df <- df %>%
    filter(area_name %in% new_df$area_name)
  
# plots a line graph of the var_name (population) for each year for the top or bottom 'n' counties for the state entered in the function argument  
  ggplot(filter_df, aes(x = yr, y = get(var_name), color = area_name)) +
    geom_line() + scale_y_continuous(labels = scales::comma) + ylab(var_name)
}

Put It Together

EDU URLs

Run the data processing function on the two EDU URLs

edu_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", var_name = "pop")
edu_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", var_name = "pop")


Combine function

edu_01 <- combine_data(edu_01a, edu_01b)


Plot state dataframe

plot(edu_01$state, var_name = "pop")


Plot county dataframe - North Carolina

plot(edu_01$county, "pop", "NC", "top", 10)


Plot county dataframe - Arizona

plot(edu_01$county, "pop", "AZ", "bottom", 6)


Plot county dataframe - defaults

# Have to specify var_name since the plotting default ("population") would not match data source ("pop")
plot(edu_01$county, "pop")


Plot county dataframe - Ohio

plot(edu_01$county, "pop", "OH", "top", 8)

PST URLs

Run the data processing function on the four PST URLs

pst_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")
pst_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
pst_01c <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv")
pst_01d <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")


Combine function

pst_01 <- combine_data(pst_01a, pst_01b)
pst_01 <- combine_data(pst_01c, pst_01)
pst_01 <- combine_data(pst_01d, pst_01)


Plot state dataframe

plot(pst_01$state)


Plot county dataframe - Pennsylvania

plot(pst_01$county,,"PA",, 5)


Plot county dataframe - Texas

plot(pst_01$county,,"TX","bottom", 12)


Plot county dataframe - default

plot(pst_01$county)


Plot county dataframe - New York

plot(pst_01$county,,"NY",, 6)